* --------------------------------------------------------------------------------------
* This script cleans Medicare data
* --------------------------------------------------------------------------------------

* Settings
* Pin the interpreter to Stata 16 behavior, then run the shared project settings script.
* NOTE(review): the SSDIMed global is used here but is not defined in this file -- confirm it is set before this script runs.
version 16
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"


* 1. Clean Denominator, Master Beneficiary Summary (Base) Files
if 1 {
  * 1a. Clean Denominator Files
  qui forvalues den_yr = 1992/2008 {
    * local den_yr 2001
    noisily di "Processing year `den_yr'"
    
    * Raw variables to load; the Denominator layout (and so the variable names) changes across year ranges.
    * Entries containing wildcard characters are patterns for the monthly indicator variables.
    if inrange(`den_yr', 1992, 1995) {
      local den_keepvars_`den_yr' ehic origbic hicov smicov zip sbdate death_dt validdt sex race hmocov oentitl centitl status esrd buyin
    }
    else if inrange(`den_yr', 1996, 1997) {
      local den_keepvars_`den_yr' ehic drefyr dorgbic dpyri?? dzip5 dbdate death_dt dvddate dsex drace dcovmhmo dorec dcrec dmedsta desrdind dcovmsbc dcovmpa dcovmpb
    }
    else if inrange(`den_yr', 1998, 2001) {
      local den_keepvars_`den_yr' ehic drefyr dorgbic dpyri?? dzip dbdate death_dt dvddate dsex drace dcovmhmo dorec dcrec dmedsta desrdind dcovmsbc dcovmpa dcovmpb
    }
    else if inrange(`den_yr', 2002, 2005) {
      local den_keepvars_`den_yr' ehic refyr entitl?* zip9 sdob death_sw death_dt sex race hmocov oentitl centitl msc esrd stbuyin hicov smicov
    }
    else if inrange(`den_yr', 2006, 2008) {
      local den_keepvars_`den_yr' bene_id rfrnc_yr crnt_bic buyin?? bene_zip bene_dob v_dod_sw death_dt sex race hmo_mo orec crec ms_cd esrd_ind buyin_mo a_mo_cnt b_mo_cnt
    }
    
    * Load this year's raw Denominator file, restricted to the year-specific variable list built above
    use_pbzip2 `den_keepvars_`den_yr'' using $SSDIMed/data/raw/medicare/100pct/den/den`den_yr'.dta.bz2, clear
    
    * Denominator years 2002-2005 do not contain the full bic code variable, so create an empty placeholder
    if inrange(`den_yr', 2002, 2005)  {
      gen crnt_bic = ""
    }
    
    * DISABLED (incomplete draft, kept for reference only): derive an equatable BIC from ehic
    * Denominator years 2002-2005 do have equatable bic codes contained in the ehic values
    *   The first 9 positions of ehic are scrambled for hic and 10 and 11 are equatable BIC codes
    *if inrange(`den_yr', 2002, 2005)  {
    *  assert inrange(strlen(ehic), 10, 11)
    *  gen eqtbl_bic = substr(ehic,10,2) 
    *}
    *else {
    *  gen eqtbl_bic = ""
    *  replace 
    *}
    
    * ehic to bene_id (only for files that carry ehic; files already keyed on bene_id skip this step)
    capture confirm variable ehic
    if _rc == 0 {
      * Drop every ehic that appears more than once (all copies are dropped, not just the extras)
      bys ehic: keep if _N==1
      
      * Merge in bene_id; use ehic values if no match to a bene_id
      * keep(match master) retains every denominator row, matched or not
      merge_pbzip2 1:1 ehic using "$SSDIMed/data/raw/medicare/100pct/xw/ehicbenex_unique.dta.bz2", keep(match master) nogenerate keepusing(bene_id)
      * Sanity check before the fallback: no crosswalked bene_id may equal its ehic
      * (presumably so that fallback ids cannot be mistaken for true bene_ids -- confirm)
      assert bene_id != ehic
      replace bene_id = ehic if missing(bene_id)
      drop ehic
    }
    * Drop every bene_id that appears more than once, so the file is unique on bene_id
    bys bene_id: keep if _N==1
    
    * File year: always overwrite with the year of the file being processed
    capture drop rfrnc_yr
    gen rfrnc_yr = `den_yr'

    * Harmonize variable names and types
    * Each loop tries the year-specific source names for one harmonized target name.
    * Renames are captured because only the names present in this year's layout exist.
    foreach old in origbic dorgbic {
      capture rename `old' crnt_bic
    }
    foreach old in hicov dcovmpa {
      capture rename `old' a_mo_cnt
    }
    destring a_mo_cnt, replace
    foreach old in smicov dcovmpb {
      capture rename `old' b_mo_cnt
    }
    destring b_mo_cnt, replace
    foreach old in zip zip9 dzip dzip5 {
      capture rename `old' bene_zip
    }
    foreach old in sbdate sdob dbdate {
      capture rename `old' bene_dob
    }
    foreach old in sdod dddate {
      capture rename `old' death_dt
    }
    foreach old in validdt death_sw dvddate {
      capture rename `old' v_dod_sw
    }
    foreach old in hmocov dcovmhmo {
      capture rename `old' hmo_mo
    }
    foreach old in oentitl dorec {
      capture rename `old' orec
    }
    foreach old in centitl dcrec {
      capture rename `old' crec
    }
    foreach old in status msc dmedsta {
      capture rename `old' ms_cd
    }
    foreach old in esrd desrdind {
      capture rename `old' esrd_ind
    }
    foreach old in buyin stbuyin dcovmsbc {
      capture rename `old' buyin_mo
    }
    capture rename dsex sex
    capture rename drace race

    * Monthly indicators: dpyri01-dpyri12 and entitl1-entitl12 both become buyin01-buyin12
    forvalues m = 1/12 {
      local mm : display %02.0f `m'
      capture rename dpyri`mm' buyin`mm'
    }
    forvalues m = 1/12 {
      local mm : display %02.0f `m'
      capture rename entitl`m' buyin`mm'
    }

    * Variable labels (these also act as a check that every harmonized variable exists)
    label var bene_id  "Beneficiary ID"
    label var rfrnc_yr "File year"
    label var crnt_bic "Current Beneficiary Identification Code"
    label var bene_dob "Beneficiary Date of Birth"
    label var v_dod_sw "Valid Date of Death Switch"
    label var death_dt "Beneficiary Date of Death"
    label var sex      "Sex"
    label var race     "Beneficiary Race Code"
    label var orec     "Original Reason for Entitlement Code"
    label var crec     "Current Reason for Entitlement Code"
    label var esrd_ind "End-Stage Renal Disease (ESRD) Indicator"
    label var ms_cd    "Beneficiary Medicare status code"
    label var a_mo_cnt "Hospital Insurance (HI) Coverage Months Count"
    label var b_mo_cnt "Supplemental Medical Insurance (SMI) Coverage Months Count"
    label var buyin_mo "State Buy-In (SBI) Coverage Months"
    label var hmo_mo   "Health Maintenance Organization (HMO) Coverage Months"
    describe
    
    * Death dates later than the file year are blanked out; any remaining death date must fall inside the file year
    replace death_dt = . if year(death_dt) > `den_yr'
    assert year(death_dt) == `den_yr' if !missing(death_dt)

    * Coverage start date

    * Method 1. Start of first month in reference year with any coverage buyin
    * (monthly indicators are strings; "0" means no coverage in that month)
    capture confirm variable buyin01
    if _rc != 0 {
      * No monthly indicators in this year's layout: method 1 is undefined
      gen covstart_buyin = .
    }
    else {
      assert buyin01!="0" | buyin02!="0" | buyin03!="0" | buyin04!="0" | buyin05!="0" | buyin06!="0" | buyin07!="0" | buyin08!="0" | buyin09!="0" | buyin10!="0" | buyin11!="0" | buyin12!="0"
      * Walk backwards from December so that the earliest covered month is the last one written
      gen covstart_buyin = .
      forvalues m = 12(-1)1 {
        local mm : display %02.0f `m'
        replace covstart_buyin = mdy(`m', 1, `den_yr') if buyin`mm' != "0"
      }
      drop buyin??
      assert !missing(covstart_buyin)
    }
    format covstart_buyin %d
    tab covstart_buyin
    label var covstart_buyin "Coverage start in year, based on first month with buyin"

    * Method 2. Infer the start month of coverage as 13, less max(#months enrolled in Part A,#months enrolled in Part B)+(#months dead, and therefore not enrolled)
    * Note: In 1999, when we know the true start date (based on exact month indicators), this procedure produces the true start month in 99.986% of cases
    * Full calendar months after the month of death; zero when there is no usable death date in the file year
    gen fullmonthsdead = cond(missing(month(death_dt)) | year(death_dt) > `den_yr', 0, 12 - month(death_dt))
    * Floor at 1 (January) in case the month counts imply a start before the file year
    gen covstartmo = max(1, 13 - (max(a_mo_cnt, b_mo_cnt) + fullmonthsdead))
    drop fullmonthsdead
    assert inrange(covstartmo, 1, 12)
    tab covstartmo
    
    * Assume that people who were enrolled in Medicare in this year and previous year were eligible on Jan 1 of this year
    * Note 1: in 1999 (where we know exact start month in the year), 99.994% of observations matching BSF 1998 did in fact have a start date of Jan 1.
    * Note 2: Only possible 1993+, since don't have denominator prior to 1992.
    * Note 3: In 1993, this change only changes the definition of covstartmo in 22k cases out of 35M, i.e. less than 0.06% of cases, so this step is almost inconsequential
    local den_yr_m1 = `den_yr'-1
    * The step is skipped silently when the prior-year raw file is not on disk
    capture confirm file "$SSDIMed/data/raw/medicare/100pct/den/den`den_yr_m1'.dta.bz2"
    if _rc == 0 {
      * Build a temporary list of prior-year ids without disturbing the current-year data in memory
      preserve
      * NOTE(review): no variable list is passed here, so presumably the whole prior-year file is read although only the ids are used -- confirm
      use_pbzip2 using $SSDIMed/data/raw/medicare/100pct/den/den`den_yr_m1'.dta.bz2, clear
      
      capture confirm variable ehic
      if _rc == 0 {
        * Keeps the first row per ehic (_n==1); the current-year cleaning instead drops every duplicated id (_N==1)
        bys ehic: keep if _n==1
        merge_pbzip2 1:1 ehic using "$SSDIMed/data/raw/medicare/100pct/xw/ehicbenex_unique.dta.bz2", keep(match master) nogenerate keepusing(bene_id)
        assert bene_id != ehic
        * Same fallback as in the current year: unmatched rows use ehic as their id
        replace bene_id = ehic if missing(bene_id)
      }
      bys bene_id: keep if _n==1
      tempfile den_`den_yr_m1'
      saveold "`den_`den_yr_m1''"
      restore
    
      * Anyone also present in the prior-year file is treated as covered from January
      merge 1:1 bene_id using "`den_`den_yr_m1''", keep(match master) keepusing(bene_id)
      replace covstartmo = 1 if _merge==3
      drop _merge
    }
    
    * Coverage start date under method 2: first day of the inferred coverage start month
    gen covstart_count = mdy(covstartmo, 1, `den_yr')
    drop covstartmo
    format %d covstart_count
    tab covstart_count
    assert !missing(covstart_count)
    label var covstart_count "Coverage start in year, inferred from months of coverage/death"

    * Compare methods 1 and 2 (only in years where method 1 is defined for at least one row)
    tab covstart_buyin covstart_count
    count if !missing(covstart_buyin)
    if r(N) > 0 {
      count if covstart_buyin == covstart_count
      noisily di "alignment of covstart vars: " r(N)/_N
      assert r(N)/_N > 0.9998
    }

    * Age at end of current year, as if you remained alive through end of year (whether or not you actually were alive at end)
    capture drop age
    gen age = rfrnc_yr - year(bene_dob)
    label var age "Age at end of the year"

    * Age notes:
    *   There seem to be coding problems in high ages.  For example, in 2012, ~30% of 100 years-olds die, but only ~4% of 109 years-olds die (see http://www.ssa.gov/oact/STATS/table4c6.html for SSA life table)
    *   In fact, Medicare caps its own "age" variable at 115 in later years, because of these issues

    * Display death and birth dates as dates
    format %d death_dt bene_dob
    
    * Normalize storage types, which differ across file years:
    *   sex goes through string and back to numeric, race ends up string, hmo_mo ends up numeric
    tostring sex, replace
    tostring race, replace
    destring hmo_mo, replace

    * Convert sex to numeric, with labels
    * The replace option keeps this from failing with r(110) if a value label named sex is already in memory
    * (same convention as the rec label below)
    destring sex, replace
    label define sex 0 "Unknown" 1 "Male" 2 "Female", replace
    label values sex sex

    * Label reasons for entitlement (original and current share one value label)
    destring orec crec, replace
    label define rec 0 "OASI" 1 "DIB" 2 "ESRD" 3 "DIB and ESRD" 4 "Unknown", replace
    label values orec crec rec
    
    * Create 5-digit ZIPs
    * The raw ZIP is made numeric first; if any value has more than 5 digits the whole file is
    * treated as 9-digit ZIPs (zero-padded to 9, first 5 kept), otherwise values are zero-padded to 5
    destring bene_zip, replace
    rename bene_zip zip_num
    summarize zip_num, meanonly
    if r(max) >= 100000 gen bene_zip5 = substr(string(zip_num, "%09.0f"), 1, 5)
    else                gen bene_zip5 = string(zip_num, "%05.0f")
    drop zip_num
    * string() renders a missing number as ".", so turn that back into an empty string
    replace bene_zip5 = "" if bene_zip5 == "."
    label var bene_zip5 "5-digit ZIP Code"

    * Month counts as numeric (stored as string in some years)
    destring buyin_mo a_mo_cnt b_mo_cnt, replace

    * Tidy up: fixed column order, and confirm the file is unique on bene_id
    order bene_id rfrnc_yr bene_dob death_dt v_dod_sw age race sex crnt_bic bene_zip5 esrd_ind covstart* a_mo_cnt b_mo_cnt buyin_mo hmo_mo
    gisid bene_id

    * Save cleaned denominator file, then hand it to the project pbzip2 command
    local procdir "$SSDIMed/data/proc/medicare"
    capture mkdir "`procdir'"
    capture mkdir "`procdir'/100pct"
    capture mkdir "`procdir'/100pct/den"
    local outfile "`procdir'/100pct/den/den`den_yr'.dta"
    save "`outfile'", replace
    pbzip2 using "`outfile'", replace
  }

  * 1b. Clean Master Beneficiary Summary (Base) Files
  qui foreach mbsf_yr in 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017  {
    * local mbsf_yr 2007
    noisily di "Processing year `mbsf_yr'"
    
    * Raw variables to load. The list is the same in every year except for the ZIP variable name:
    * bene_zip in 1999, zip_cd in 2006-2017.
    local zipvar
    if `mbsf_yr' == 1999              local zipvar bene_zip
    if inrange(`mbsf_yr', 2006, 2017) local zipvar zip_cd
    if "`zipvar'" != "" {
      local bsfab_keepvars_`mbsf_yr' bene_id rfrnc_yr crnt_bic buyin?? covstart `zipvar' age bene_dob v_dod_sw death_dt sex race hmo_mo orec crec esrd_ind buyin_mo a_mo_cnt b_mo_cnt
    }

    * Load Beneficiary Summary File: A/B, then drop every bene_id that appears more than once
    use_pbzip2 `bsfab_keepvars_`mbsf_yr'' using "$SSDIMed/data/raw/medicare/100pct/bsf/bsfab`mbsf_yr'.dta.bz2", clear
    bysort bene_id: keep if _N == 1
    
    * File year: the reference year stored in the file must agree with the file being processed
    assert rfrnc_yr == `mbsf_yr'

    * Harmonize variable names and types
    capture rename zip_cd bene_zip
    destring a_mo_cnt b_mo_cnt, replace

    * Variable labels
    label var bene_id  "Beneficiary ID"
    label var rfrnc_yr "File year"
    label var crnt_bic "Current Beneficiary Identification Code"
    label var bene_dob "Beneficiary Date of Birth"
    label var v_dod_sw "Valid Date of Death Switch"
    label var death_dt "Beneficiary Date of Death"
    label var sex      "Sex"
    label var race     "Beneficiary Race Code"
    label var covstart "Medicare Coverage Start Date"
    label var orec     "Original Reason for Entitlement Code"
    label var crec     "Current Reason for Entitlement Code"
    label var esrd_ind "End-Stage Renal Disease (ESRD) Indicator"
    label var a_mo_cnt "Hospital Insurance (HI) Coverage Months Count"
    label var b_mo_cnt "Supplemental Medical Insurance (SMI) Coverage Months Count"
    label var buyin_mo "State Buy-In (SBI) Coverage Months"
    label var hmo_mo   "Health Maintenance Organization (HMO) Coverage Months"

    * Death dates later than the file year are blanked out; any remaining death date must fall inside the file year
    replace death_dt = . if year(death_dt) > `mbsf_yr'
    assert year(death_dt) == `mbsf_yr' if !missing(death_dt)
    
    * Coverage start date

    * Method 0. Admin covstart: must be almost always present and always the first of a month
    count if !missing(covstart)
    assert r(N)/_N > 0.99999
    assert day(covstart) == 1 if !missing(covstart)
    format %d covstart

    * Method 1. Start of first month in reference year with any coverage buyin
    * (monthly indicators are strings; "0" means no coverage in that month)
    capture confirm variable buyin01
    if !_rc {
      assert buyin01!="0" | buyin02!="0" | buyin03!="0" | buyin04!="0" | buyin05!="0" | buyin06!="0" | buyin07!="0" | buyin08!="0" | buyin09!="0" | buyin10!="0" | buyin11!="0" | buyin12!="0"
      * Walk backwards from December so that the earliest covered month is the last one written
      gen covstart_buyin = .
      forvalues m = 12(-1)1 {
        local mm : display %02.0f `m'
        replace covstart_buyin = mdy(`m', 1, `mbsf_yr') if buyin`mm' != "0"
      }
      format %d covstart_buyin
      drop buyin??
    }
    tab covstart_buyin
    assert !missing(covstart_buyin)
    label var covstart_buyin "Coverage start in year, based on first month with buyin"

    * Method 2. Infer the start month of coverage as 13, less max(#months enrolled in Part A,#months enrolled in Part B)+(#months dead, and therefore not enrolled)
    * Note: In 1999, when we know the true start date (based on exact month indicators), this procedure produces the true start month in 99.986% of cases
    * Full calendar months after the month of death; zero when there is no usable death date in the file year
    gen fullmonthsdead = cond(missing(month(death_dt)) | year(death_dt) > `mbsf_yr', 0, 12 - month(death_dt))
    * Floor at 1 (January) in case the month counts imply a start before the file year
    gen covstartmo = max(1, 13 - (max(a_mo_cnt, b_mo_cnt) + fullmonthsdead))
    drop fullmonthsdead
    assert inrange(covstartmo, 1, 12)
    tab covstartmo
    
    * Assume that people who were enrolled in Medicare in this year and previous year were eligible on Jan 1 of this year
    * Note 1: in 1999 (where we know exact start month in the year), 99.994% of observations matching BSF 1998 did in fact have a start date of Jan 1.
    * Note 2: Only possible 1993+, since don't have denominator prior to 1992.
    * Note 3: In 1993, this change only changes the definition of covstartmo in 22k cases out of 35M, i.e. less than 0.06% of cases, so this step is almost inconsequential
    * NOTE(review): notes 2 and 3 read as if written for the denominator loop; here the prior-year file is the raw MBSF, and the step only runs when that file exists on disk
    local mbsf_yr_m1 = `mbsf_yr'-1
    capture confirm file "$SSDIMed/data/raw/medicare/100pct/bsf/bsfab`mbsf_yr_m1'.dta.bz2"
    if _rc == 0 {
      * Build a temporary list of prior-year ids without disturbing the current-year data in memory
      preserve
      * NOTE(review): no variable list is passed here, so presumably the whole prior-year file is read although only bene_id is used -- confirm
      use_pbzip2 using $SSDIMed/data/raw/medicare/100pct/bsf/bsfab`mbsf_yr_m1'.dta.bz2, clear
      
      * Keeps the first row per bene_id (_n==1); the current-year cleaning instead drops every duplicated id (_N==1)
      bys bene_id: keep if _n==1
      tempfile bsf_`mbsf_yr_m1'
      saveold "`bsf_`mbsf_yr_m1''"
      restore
    
      * Anyone also present in the prior-year file is treated as covered from January
      merge 1:1 bene_id using "`bsf_`mbsf_yr_m1''", keep(match master) keepusing(bene_id)
      replace covstartmo = 1 if _merge==3
      drop _merge
    }
    
    * Generate the coverage start date, which is the first day of the coverage start month
    gen covstart_count = mdy(covstartmo,01,`mbsf_yr')
    drop covstartmo
    format covstart_count %d
    tab covstart_count
    assert !missing(covstart_count)
    label var covstart_count "Coverage start in year, inferred from months of coverage/death"

    * Compare methods 1 and 2
    * (The earlier count of missing values was removed: its r(N) was overwritten by the count below
    *  before being used, so it only cost an extra pass over the data.)
    tab covstart_buyin covstart_count
    count if covstart_buyin == covstart_count
    * Note: 1999 and 2006 are the two years without an mbsf file in the preceding year, suggesting that accounting for prior year helps slightly
    noisily di "alignment of covstart vars: " `r(N)'/_N
    * Looser agreement threshold for the two years without a prior-year file
    if inlist(`mbsf_yr', 1999, 2006) assert `r(N)'/_N > 0.9987
    else                             assert `r(N)'/_N > 0.99975

    * Age at end of current year, as if you remained alive through end of year (whether or not you actually were alive at end)
    * The age variable shipped with the file is replaced by this definition
    cap drop age
    generate age = rfrnc_yr - year(bene_dob)
    label var age "Age at end of the year"

    * Age notes:
    *   There seem to be coding problems in high ages.  For example, in 2012, ~30% of 100 years-olds die, but only ~4% of 109 years-olds die (see http://www.ssa.gov/oact/STATS/table4c6.html for SSA life table)
    *   In fact, Medicare caps its own "age" variable at 115 in later years, because of these issues
    *   NOTE(review): an earlier remark here said "In 1995, drops 0.28% of obs", but nothing is dropped at this step -- likely stale

    * Format date variables
    format death_dt bene_dob %d
    
    * Normalize storage types, which may differ across file years:
    *   sex goes through string and back to numeric, race ends up string, hmo_mo ends up numeric
    tostring sex, replace
    tostring race, replace
    destring hmo_mo, replace

    * Convert sex to numeric, with labels
    * The replace option keeps this from failing with r(110) if a value label named sex is already in memory
    * (same convention as the rec label below)
    destring sex, replace
    label define sex 0 "Unknown" 1 "Male" 2 "Female", replace
    label values sex sex

    * Label reasons for entitlement (original and current share one value label)
    destring orec crec, replace
    label define rec 0 "OASI" 1 "DIB" 2 "ESRD" 3 "DIB and ESRD" 4 "Unknown", replace
    label values orec crec rec
    
    * Create 5-digit ZIPs
    * The raw ZIP is made numeric first; if any value has more than 5 digits the whole file is
    * treated as 9-digit ZIPs (zero-padded to 9, first 5 kept), otherwise values are zero-padded to 5
    destring bene_zip, replace
    rename bene_zip zip_num
    summarize zip_num, meanonly
    if r(max) >= 100000 gen bene_zip5 = substr(string(zip_num, "%09.0f"), 1, 5)
    else                gen bene_zip5 = string(zip_num, "%05.0f")
    drop zip_num
    * string() renders a missing number as ".", so turn that back into an empty string
    replace bene_zip5 = "" if bene_zip5 == "."
    label var bene_zip5 "5-digit ZIP Code"

    * Month counts as numeric (stored as string in some years)
    destring buyin_mo a_mo_cnt b_mo_cnt, replace

    * Tidy up: fixed column order, and confirm the file is unique on bene_id
    order bene_id rfrnc_yr bene_dob death_dt v_dod_sw age race sex crnt_bic bene_zip5 esrd_ind covstart* a_mo_cnt b_mo_cnt buyin_mo hmo_mo
    gisid bene_id

    * Save cleaned base beneficiary summary file, then hand it to the project pbzip2 command
    local procdir "$SSDIMed/data/proc/medicare"
    capture mkdir "`procdir'"
    capture mkdir "`procdir'/100pct"
    capture mkdir "`procdir'/100pct/bsf"
    local outfile "`procdir'/100pct/bsf/bsfab`mbsf_yr'.dta"
    save "`outfile'", replace
    pbzip2 using "`outfile'", replace
  }

}


* 2. Create a zip-to-modal(county)/modal(state) crosswalk file
if 1 {
  * Output: final file includes only zipcodes for which ssacounty matches fipscounty in the CMS SSA-FIPS crosswalk file
  * File: Zip codes match to unique state-county pair, using observations across all years
  
  * ZIP to County and State association
  * Since zipcodes may occasionally cross county lines, calculate how frequently each 5-digit zip is associated with each county/state
  qui forvalues file_yr = 1992/2017 {
    * local file_yr 2017
    noisily di "Processing year `file_yr'"
    
    * Year-specific names of the ZIP, county and state variables in the raw files.
    * The year ranges are disjoint. Previously 1996-1999 and 1998-2001 overlapped, and 1998-1999 ended up
    * with dzip only because the later assignment overwrote the earlier one; the resulting list per year is unchanged.
    * NOTE(review): 2002-2005 loads zipcode here, while the denominator cleaning step loads zip9 for the same years -- confirm both exist in the raw files
    if      inrange(`file_yr', 1992, 1995) local file_keepvars_`file_yr' zip county state
    else if inrange(`file_yr', 1996, 1997) local file_keepvars_`file_yr' dzip5 dcounty dstate
    else if inrange(`file_yr', 1998, 2001) local file_keepvars_`file_yr' dzip dcounty dstate
    else if inrange(`file_yr', 2002, 2005) local file_keepvars_`file_yr' zipcode county state
    else if inrange(`file_yr', 2006, 2006) local file_keepvars_`file_yr' bene_zip cnty_cd state_cd
    else if inrange(`file_yr', 2007, 2017) local file_keepvars_`file_yr' zip_cd cnty_cd state_cd

    * Following sample construction for ENROLLMENT + ESRD (DI), use denominator for file years 1992-2006, bsf for file years 2007-2017
    if      inrange(`file_yr', 1992, 2006) use_pbzip2 `file_keepvars_`file_yr'' using "$SSDIMed/data/raw/medicare/100pct/den/den`file_yr'.dta.bz2", clear
    else if inrange(`file_yr', 2007, 2017) use_pbzip2 `file_keepvars_`file_yr'' using "$SSDIMed/data/raw/medicare/100pct/bsf/bsfab`file_yr'.dta.bz2", clear
    
    * Harmonize variable names across file layouts (only the names present in this year exist)
    foreach old in zip dzip5 dzip zipcode zip_cd {
      capture rename `old' bene_zip
    }
    foreach old in cnty_cd dcounty {
      capture rename `old' county
    }
    foreach old in state_cd dstate {
      capture rename `old' state
    }

    * ZIP: force to string, blank out ".", left-pad short values with zeros to 5 characters
    capture confirm string variable bene_zip
    if _rc != 0 {
      tostring bene_zip, replace
    }
    replace bene_zip = "" if bene_zip == "."
    replace bene_zip = substr("0000", 1, 5 - strlen(bene_zip)) + bene_zip if inrange(strlen(bene_zip), 1, 4)
    assert strlen(bene_zip) >= 5 if !missing(bene_zip)
    gen zip5 = substr(bene_zip, 1, 5)
    drop bene_zip

    * County: same treatment, padded to 3 characters
    capture confirm string variable county
    if _rc != 0 {
      tostring county, replace
    }
    replace county = "" if county == "."
    replace county = substr("00", 1, 3 - strlen(county)) + county if inrange(strlen(county), 1, 2)
    assert strlen(county) == 3 if !missing(county)

    * State: same treatment, padded to 2 characters
    capture confirm string variable state
    if _rc != 0 {
      tostring state, replace
    }
    replace state = "" if state == "."
    replace state = "0" + state if strlen(state) == 1
    assert strlen(state) == 2 if !missing(state)

    * Drop clearly missing zip, state, county codes
    drop if inlist(zip5, "99999", "00000") | missing(zip5, county, state)

    * zip/state/county digits should be 5, 2 and 3, respectively
    assert strlen(zip5) == 5 & strlen(state) == 2 & strlen(county) == 3

    * Convert SSA county to 5-digit version, with state prepended
    gen ssacounty = state + county
    assert strlen(ssacounty) == 5
    drop state county
    
    * Cross-walk in state and county FIPS codes, using CMS's SSA-FIPS crosswalk file maintained by NBER
    * keep(match) drops every row whose ssacounty is not found in the crosswalk
    merge m:1 ssacounty using $SSDIMed/data/raw/nber/ssa-fips-state-county-crosswalk/ssa_fips_state_county2017.dta, keep(match) nogenerate noreport keepusing(state county fipsstate fipscounty cbsa cbsaname ssastate ssacounty)
    
    * Count the number of times each ssacounty (county-state) is associated with each zip5
    sort ssacounty
    local county_vars county state fipscounty cbsa cbsaname ssastate fipsstate 
    * Sanity check: the crosswalk attributes must be constant within ssacounty
    foreach var in `county_vars' {
      by ssacounty: assert `var' == `var'[1]
    }
    gen rfrnc_yr = `file_yr'
    * One row per zip5-ssacounty pair; N_cty_st holds the number of beneficiary rows in the pair
    gcollapse (first) rfrnc_yr, by(zip5 ssacounty `county_vars') freq(N_cty_st) labelformat(#sourcelabel#)
    gisid zip5 ssacounty
    
    label var zip5 "Beneficiary 5-digit ZIP code"
    label var ssacounty "SSA county (2-digit state + 3-digit county)"
    label var N_cty_st "Frequency of zip5-ssacounty pair"

    * Save this year's counts to a temporary file; the files are appended after the loop
    order zip5 ssacounty N_cty_st rfrnc_yr
    compress
    saveold "$SSDIMed/data/temp/tmp_zcs_`file_yr'", replace
  }
  * Stack the yearly zip5-ssacounty count files into one temporary file
  clear all
  local tmpdir "$SSDIMed/data/temp"
  forvalues file_yr = 1992/2017 {
    append using "`tmpdir'/tmp_zcs_`file_yr'"
  }
  saveold "`tmpdir'/tmp_zcs", replace
  
  * UNIQUE CW
  * ZIP to County and State crosswalk
  * Unique zip to region match, does NOT vary by year
  * Since zipcodes may occasionally cross county lines, calculate modal state and county for each 5-digit zipcode
  if 1 {
    * Build the year-invariant crosswalk: one row per ZIP5, carrying its modal SSA county and that county's state
    use "$SSDIMed/data/temp/tmp_zcs", clear

    * zip5 and ssacounty should both be 5 characters
    assert strlen(zip5) == 5 & strlen(ssacounty) == 5
    assert !missing(fipscounty)

    * collapse to zip5-ssacounty level, i.e. do not distinguish by year
    sort ssacounty
    local county_vars county state fipscounty cbsa cbsaname ssastate fipsstate
    * Sanity check: the crosswalk attributes must be constant within ssacounty
    foreach var in `county_vars' {
      by ssacounty: assert `var' == `var'[1]
    }
    gcollapse (sum) N_cty_st, by(zip5 ssacounty `county_vars') labelformat(#sourcelabel#)
    gisid zip5 ssacounty

    * For each zip, keep the state-county pair with the most observations
    *   In case of ties, keep the lowest numbered county (a la egen's minmode)
    *   Sorting by ascending count and descending ssacounty puts that pair last within each zip
    gsort zip5 N_cty_st -ssacounty
    by zip5: gen modal_county = ssacounty[_N]
    * Check the newly created variable (previously this re-checked ssacounty, which was already validated above)
    assert !missing(modal_county)
    * modal_state is the state of the modal county; it is not computed as a mode over states
    by zip5: gen modal_state = state[_N]

    * Calculate the fraction of obs (benes) with ZIP5 in the modal county/state
    by zip5: gegen in_zip = total(N_cty_st)
    by zip5: gegen in_modal_county = total(N_cty_st * (modal_county == ssacounty))
    replace in_modal_county = in_modal_county/in_zip
    by zip5: gegen in_modal_state = total(N_cty_st * (modal_state == state))
    replace in_modal_state = in_modal_state/in_zip
    label var in_modal_county "Fraction of benes with ZIP5 in the modal county"
    label var in_modal_state "Fraction of benes with ZIP5 in the modal state"

    * Limit to one obs per zip: the row of the modal county (a row-wise condition, so no by-prefix is needed)
    keep if modal_county == ssacounty
    label var ssacounty "Modal SSA county matching ZIP5 in Medicare data (2-digit state + 3-digit county)"
    assert modal_state == state
    label var ssastate "Modal SSA State matching ZIP5 in Medicare data"
    drop modal_county modal_state
    * Distribution of the modal shares, weighted by the number of benes in the modal pair
    sum in_modal_county [aw = N_cty_st], d
    sum in_modal_state  [aw = N_cty_st], d

    * Keep select variables
    keep zip5 ssacounty county state fipscounty cbsa cbsaname ssastate fipsstate in_modal_county in_modal_state

    * Save
    compress
    order zip5 in_modal_county in_modal_state county state ssacounty ssastate fipscounty fipsstate cbsa cbsaname
    rename zip5 bene_zip5
    gisid bene_zip5
    sort bene_zip5
    * Shell call: relies on a Unix-style mkdir being available
    !mkdir -p "$SSDIMed/data/proc/medicare/cw"
    saveold "$SSDIMed/data/proc/medicare/cw/zip_cty_st_unique_1992-2017", replace
  }
  
  * Remove the yearly and combined temporary count files
  local tmpdir "$SSDIMed/data/temp"
  forvalues file_yr = 1992/2017 {
    erase "`tmpdir'/tmp_zcs_`file_yr'.dta"
  }
  erase "`tmpdir'/tmp_zcs.dta"
  
}  


* 3. COHORT (DI). Define cohorts of Medicare non-elderly recipients
if 1 {

  * ------------------------------------------------------------------
  * Identify non-elderly in Medicare eligibility files
  * ------------------------------------------------------------------

  * Individuals getting Medicare coverage prior to normal elderly eligibility age.  
  *   Keep only individuals ages 65 or under who gained Medicare coverage prior to the date first eligible under elderly coverage
  *   Note: you become eligible for elderly Medicare on the first of the month of the day before you turn 65.
  *   I.e. elderly eligibility begins on mdy(month(bene_dob-1),1,year(bene_dob-1)+65)
  *   Reference: https://www.medicare.gov/sign-up-change-plans/get-parts-a-and-b/when-coverage-starts/when-coverage-starts.html
  *   "Your coverage starts the first day of the month you turn 65, unless your birthday is on the first day of the month.
  *   Example 1: Mr. Green's 65th birthday is July 20, 2017. If he signs up for Medicare in April, May, or June, his coverage will start on July 1, 2017.
  *   Example 2: Mr. Kim's 65th birthday is July 1, 2017. If he signs up for Medicare in March, April, or May, his coverage will start on June 1, 2017."

  *   To see this empirically, run the following code for, say, mbsf_yr = 2010, using a cleaned MBSF file (which includes the admin covstart variable)
  *     use_pbzip2 using $SSDIMed/data/proc/medicare/100pct/bsf/bsfab`mbsf_yr'.dta, clear
  *     count
  *     count if day(bene_dob)==1
  *     generat timing = 0 if covstart>mdy(month(bene_dob),1,year(bene_dob)+65) & day(bene_dob)==1
  *     replace timing = 1 if covstart==mdy(month(bene_dob),1,year(bene_dob)+65) & day(bene_dob)==1
  *     replace timing = 2 if covstart==mdy(month(bene_dob-1),1,year(bene_dob-1)+65) & day(bene_dob)==1
  *     replace timing = 3 if covstart<mdy(month(bene_dob-1),1,year(bene_dob-1)+65) & day(bene_dob)==1
  *     replace timing = 0 if covstart>mdy(month(bene_dob),1,year(bene_dob)+65) & day(bene_dob)>1
  *     replace timing = 1 if covstart==mdy(month(bene_dob),1,year(bene_dob)+65) & day(bene_dob)>1
  *     replace timing = 2 if covstart==mdy(month(bene_dob-1),1,year(bene_dob-1)+65) & day(bene_dob)>1
  *     replace timing = 3 if covstart<mdy(month(bene_dob-1),1,year(bene_dob-1)+65) & day(bene_dob)>1
  *     gen day1 = day(bene_dob)==1
  *     tab timing day1

  * Denominator files
  qui forvalues den_yr = 1992/2008 {
    * local den_yr 1992
    noisily di "Working on year `den_yr'"

    * Load only the variables needed to define the nonelderly cohort
    local keepvars bene_id rfrnc_yr age race sex covstart* orec *bic bene_dob bene_zip5
    use_pbzip2 `keepvars' using $SSDIMed/data/proc/medicare/100pct/den/den`den_yr'.dta, clear

    * Start of elderly eligibility: the first of the month containing the day before the 65th birthday
    local elig65 mdy(month(bene_dob - 1), 1, year(bene_dob - 1) + 65)

    * Nonelderly = coverage starts BEFORE elderly eligibility AND age at end of file year is 65 or less
    * A missing covstart_buyin (years without monthly indicators) yields 0, since missing sorts above any date
    gen byte nonelderly_count = (covstart_count < `elig65') & (age <= 65)
    gen byte nonelderly_buyin = (covstart_buyin < `elig65') & (age <= 65)
    label var nonelderly_count "Medicare coverage started before age 65 eligibility, based on covstart_count"
    label var nonelderly_buyin "Medicare coverage started before age 65 eligibility, based on covstart_buyin"

    * Concordance between the two definitions, among rows flagged by either
    keep if nonelderly_count | nonelderly_buyin
    local n_either = _N
    count if nonelderly_count & nonelderly_buyin
    local n_both = r(N)
    di `n_both'/`n_either'
    * QC: for years where nonelderly_buyin is defined (early years do not have buyin variables)
    summarize nonelderly_buyin
    if r(max) > 0 assert `n_both'/`n_either' > 0.9999
    noisily di _N

    * Den/mbsf flag
    gen file_vintage = "den"
    label var file_vintage "File vintage (denominator or mbsf)"

    save "$SSDIMed/data/temp/tmp_di_den_`den_yr'", replace
  }

  * MBSF files
  qui foreach mbsf_yr in 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 {
    * local mbsf_yr 2017
    noisily di "Working on year `mbsf_yr'"

    * Load only the variables needed to define the nonelderly cohort
    local keepvars bene_id rfrnc_yr age race sex covstart* orec *bic bene_dob bene_zip5
    use_pbzip2 `keepvars' using $SSDIMed/data/proc/medicare/100pct/bsf/bsfab`mbsf_yr'.dta, clear

    * Start of elderly eligibility: the first of the month containing the day before the 65th birthday
    local elig65 mdy(month(bene_dob - 1), 1, year(bene_dob - 1) + 65)

    * Nonelderly = coverage starts BEFORE elderly eligibility AND age at end of file year is 65 or less,
    * evaluated once for each of the three coverage start definitions
    gen byte nonelderly_admin = (covstart       < `elig65') & (age <= 65)
    gen byte nonelderly_count = (covstart_count < `elig65') & (age <= 65)
    gen byte nonelderly_buyin = (covstart_buyin < `elig65') & (age <= 65)
    label var nonelderly_count "Medicare coverage started before age 65 eligibility, based on covstart_count"
    label var nonelderly_buyin "Medicare coverage started before age 65 eligibility, based on covstart_buyin"
    label var nonelderly_admin "Medicare coverage started before age 65 eligibility, based on admin covstart"

    * Concordance between the three definitions, among rows flagged by any of them
    keep if nonelderly_admin | nonelderly_count | nonelderly_buyin
    local n_either = _N
    count if nonelderly_admin & nonelderly_count & nonelderly_buyin
    local n_all = r(N)
    di `n_all'/`n_either'
    assert `n_all'/`n_either' > 0.994
    noisily di _N

    * Den/mbsf flag
    gen file_vintage = "mbsf"
    label var file_vintage "File vintage (denominator or mbsf)"

    save "$SSDIMed/data/temp/tmp_di_bsf_`mbsf_yr'", replace
  }


  * ------------------------------------------------------------------
  * Limit to first year each bene observed, by file vintage
  * ------------------------------------------------------------------

  * Denominator files: stack the yearly temp files one at a time; after every
  * append, trim back to a single row per bene_id (the earliest rfrnc_yr seen so far)
  clear
  qui forvalues yr = 1992/2008 {
    noisily di "Adding denominator file for year `yr'"
    append using "$SSDIMed/data/temp/tmp_di_den_`yr'.dta"

    * Order rows by rfrnc_yr within bene_id and discard everything after the first row
    bysort bene_id (rfrnc_yr): drop if _n > 1
  }
  tab rfrnc_yr
  * bene_id must now uniquely identify rows
  gisid bene_id
  save "$SSDIMed/data/temp/tmp_di_den.dta", replace

  * MBSF files: stack the yearly temp files one at a time; after every append,
  * trim back to a single row per bene_id (the earliest rfrnc_yr seen so far)
  clear
  qui foreach yr in 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 {
    noisily di "Adding MBSF file for year `yr'"
    append using "$SSDIMed/data/temp/tmp_di_bsf_`yr'.dta"

    * Order rows by rfrnc_yr within bene_id and discard everything after the first row
    bysort bene_id (rfrnc_yr): drop if _n > 1
  }
  tab rfrnc_yr
  * bene_id must now uniquely identify rows
  gisid bene_id
  save "$SSDIMed/data/temp/tmp_di_bsf.dta", replace


  * ------------------------------------------------------------------
  * Initial covstart for all elderly and non-elderly in MBSF files 
  *   Will be used to merge in covstart info, including for people 
  *   in denominator years who may be >=65 by the time they reach
  *   an MBSF file year
  * ------------------------------------------------------------------
  * Step 1: extract (bene_id, rfrnc_yr, covstart) from every MBSF year into a temp file
  qui foreach yr in 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 {
    * local yr 1999
    noisily di "Working on year `yr'"

    use_pbzip2 bene_id rfrnc_yr covstart using $SSDIMed/data/proc/medicare/100pct/bsf/bsfab`yr'.dta, clear
    * Each yearly file is expected to hold one row per bene_id
    gisid bene_id

    save "$SSDIMed/data/temp/tmp_di_covstart_`yr'", replace
  }

  * Step 2: stack all years
  clear
  qui foreach yr in 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 {
    noisily di "Appending year `yr'"
    append using "$SSDIMed/data/temp/tmp_di_covstart_`yr'"
  }

  * Step 3: per bene, keep covstart from the earliest year in which it is nonmissing
  keep if !missing(covstart)
  gegen first_yr_reported = min(rfrnc_yr), by(bene_id)
  keep if rfrnc_yr == first_yr_reported
  drop rfrnc_yr first_yr_reported
  sort bene_id
  gisid bene_id
  save "$SSDIMed/data/temp/tmp_di_covstart", replace
  
  * ------------------------------------------------------------------
  * Initial directory of nonelderly benes, 1993 - 2017
  * ------------------------------------------------------------------

  * Build file of disabled benes in first year in the Medicare data
  *   Note: - denominator-vintage rows are used for first years 1993-2006
  *           (denominator is available beginning 1992)
  *         - MBSF-vintage rows are used for first years 2007-2017
  *           (MBSF available consecutively beginning 2006)
  clear
  foreach vintage in den bsf {
    append using "$SSDIMed/data/temp/tmp_di_`vintage'.dta"
  }
  * At most one row per bene within each vintage
  gisid bene_id file_vintage
  tab rfrnc_yr file_vintage

  * Restrict each vintage to the years it is responsible for
  local from_den  (file_vintage == "den" & inrange(rfrnc_yr, 1993, 2006))
  local from_mbsf (file_vintage == "mbsf" & inrange(rfrnc_yr, 2007, 2017))
  keep if `from_den' | `from_mbsf'

  * A bene can survive in both vintages; keep the earlier-year row
  bysort bene_id (rfrnc_yr): drop if _n > 1
  gisid bene_id
  tab rfrnc_yr file_vintage
  assert _N == 17826105


  * ------------------------------------------------------------------
  * Construct coverage start date (may != initial year observed)
  * ------------------------------------------------------------------

  * Fill missing admin covstart from the covstart directory (tmp_di_covstart), which
  * holds a value for every bene_id for whom covstart is ever reported in an MBSF file

  * Denominator-vintage rows must not carry an admin covstart yet
  count if (file_vintage == "den") & !missing(covstart)
  assert r(N) == 0

  * Park the in-memory value under a scratch name so the merged-in variable can be called covstart
  rename covstart covstart_master
  merge 1:1 bene_id using "$SSDIMed/data/temp/tmp_di_covstart", keep(match master) keepusing(covstart) nogen 
  replace covstart_master = covstart if missing(covstart_master)
  drop covstart
  rename covstart_master covstart

  * The merge must not add or remove rows
  assert _N == 17826105

  * QC: share of benes with a nonmissing admin covstart, by first-observed year
  gen mbsf_year = (inlist(rfrnc_yr, 1999) | inrange(rfrnc_yr, 2006, 2017))
  label var mbsf_year "Year for which administrative covstart variable is available"
  gegen has_covstart = mean(!missing(covstart)), by(rfrnc_yr)
  gstats tab has_covstart, by(rfrnc_yr)

  * Rounded shares are compared with a tolerance rather than exact equality:
  * round(x, 0.001) is a floating-point product and need not match the decimal
  * literal bit-for-bit. This is the same test the mbsf_year == 0 check below uses.
  sum has_covstart if mbsf_year == 1
  assert abs(round(`r(min)', 0.001) - 0.998) < 1e-14 & (`r(max)' == 1)
  sum has_covstart if mbsf_year == 0
  assert abs(round(`r(min)', 0.001) - 0.821) < 1e-14 & abs(round(`r(max)', 0.001) - 0.982) < 1e-14
  drop has_covstart

  * QC: the two inferred covstart variables (covstart_buyin, covstart_count) agree
  * in more than 99.7% of rows where covstart_buyin is defined
  assert year(covstart_count) == rfrnc_yr
  assert !missing(covstart_count)
  count if !missing(covstart_buyin)
  local n_buyin_defined = r(N)
  count if !missing(covstart_buyin) & covstart_buyin == covstart_count
  assert r(N)/`n_buyin_defined' > 0.997

  * Drop covstart/nonelderly status based on buyin, since it's not defined for all years
  * (covstart_buyin is missing exactly for rfrnc_yr <= 1995)
  assert missing(covstart_buyin) == (rfrnc_yr <= 1995)
  drop covstart_buyin nonelderly_buyin

  * Exactly 20 benes are flagged by neither remaining source (count nor admin); remove them
  count if nonelderly_count != 1 & nonelderly_admin != 1
  assert r(N) == 20
  drop if nonelderly_count != 1 & nonelderly_admin != 1
  assert _N == 17826085
  drop nonelderly_count nonelderly_admin

  * QC: Show alignment between admin covstart and covstart_count in years without an
  * admin covstart of their own (mbsf_year == 0), among rows where admin covstart was filled in
  count if (mbsf_year == 0) & !missing(covstart)
  local N_denom = `r(N)'
  count if covstart==covstart_count & (mbsf_year == 0) & !missing(covstart)
  * Tolerance comparison instead of exact equality: the rounded ratio is a
  * floating-point value and need not equal the literal .894 bit-for-bit
  * (same convention as the other rounded-share asserts in this file)
  assert abs(round(`r(N)'/`N_denom', 0.001) - .894) < 1e-14

  * Cross-tabulate year of admin covstart against first-observed year (display only)
  gen year_covstart = year(covstart)
  gen year_covstart_count = year(covstart_count)
  assert year_covstart_count == rfrnc_yr
  tab year_covstart rfrnc_yr
  tab year_covstart if year_covstart_count==1993, miss
  tab year_covstart if year_covstart_count==1999, miss
  tab year_covstart if year_covstart_count==2017, miss
  drop year_covstart year_covstart_count

  * Create a covstart variable that is defined for everyone:
  *   - MBSF-vintage rows with a nonmissing admin covstart use the admin value
  *   - every other row (all denominator-vintage rows, and MBSF rows with missing
  *     admin covstart) uses covstart_count
  gen covstart_fill = cond(file_vintage == "mbsf" & !missing(covstart), covstart, covstart_count)
  assert !missing(covstart_fill)
  label var covstart_fill "Admin covstart when available; otherwise covstart_count"
  format %d covstart_fill

  * QC: share of rows where covstart_fill falls AFTER covstart_count
  count if covstart_fill > covstart_count
  local n_fill_later = r(N)
  di round(`n_fill_later'/_N, 0.00001)
  assert abs(round(`n_fill_later'/_N, 0.00001) - .00008) < 1e-14

  * QC: share of rows where covstart_fill falls BEFORE covstart_count
  count if covstart_fill < covstart_count
  local n_fill_earlier = r(N)
  di round(`n_fill_earlier'/_N, 0.00001)
  assert abs(round(`n_fill_earlier'/_N, 0.00001) - .01194) < 1e-14


  * ------------------------------------------------------------------
  * Keep individuals ages 20-64 at time of coverage start
  * ------------------------------------------------------------------

  * Age at coverage start: whole months between birth month and covstart_fill month,
  * expressed in years (months/12), plus its integer floor
  cap drop age_????_covstart*
  gen age_mofd_covstart_fill = (mofd(covstart_fill) - mofd(bene_dob)) / 12
  gen age_year_covstart_fill = floor(age_mofd_covstart_fill)
  label var age_mofd_covstart_fill "(Age in months)/12 at Medicare coverage start (covstart_fill)"
  label var age_year_covstart_fill "Age in years at Medicare coverage start (covstart_fill)"

  * Sample restriction 1: ages 20-64 at time of covstart, based on covstart_fill
  *   (removes 35,838 of 17,826,085 rows)
  drop if !inrange(age_year_covstart_fill, 20, 64)
  assert _N == 17790247
  sum rfrnc_yr
  assert r(min) == 1993 & r(max) == 2017

  * Sample restriction 2: coverage start falls in sample period 1993-2017
  *   (removes a further 89,602 rows)
  drop if !inrange(year(covstart_fill), 1993, 2017)
  assert _N == 17700645

  * Distribution of age at coverage start, separately by mbsf_year (display only)
  tab age_mofd_covstart_fill if (mbsf_year == 1) & inrange(age_year_covstart_fill, 40, 65)
  tab age_mofd_covstart_fill if (mbsf_year == 0) & inrange(age_year_covstart_fill, 40, 65)
  tab age_year_covstart_fill if (mbsf_year == 1) 
  tab age_year_covstart_fill if (mbsf_year == 0) 


  * ------------------------------------------------------------------
  * Merge in modal county and state associated with ZIP5
  * ------------------------------------------------------------------
  * Look up county and state for each bene's ZIP5; benes whose ZIP5 is not in the
  * crosswalk are kept (keep(match master)) with missing county/state
  merge m:1 bene_zip5 using "$SSDIMed/data/proc/medicare/cw/zip_cty_st_unique_1992-2017", keep(match master) keepusing(fipscounty state) nogen

  * Tag the geography variables as initial-year values
  rename (bene_zip5 fipscounty state) (bene_zip5_init fipscounty_init state_init)
  label var bene_zip5_init "Original 5-digit ZIP Code, 1st year observed in Medicare elig. files"
  label var fipscounty_init "Original 5-digit FIPS county, 1st year observed in Medicare elig. files"
  label var state_init "Original 2-char State abbrev, 1st year observed in Medicare elig. files"

  * Rename variables to reflect initial conditions: every value on this one-row-per-bene
  * file comes from the first year the bene is observed, so each gets an _init suffix
  * (rfrnc_yr becomes init_yr)
  rename (bene_dob age race sex crnt_bic orec covstart rfrnc_yr mbsf_year file_vintage) ///
         (bene_dob_init age_init race_init sex_init crnt_bic_init orec_init covstart_init init_yr mbsf_year_init file_vintage_init)

  * Variable labels are saved with the dataset. The init_yr and file_vintage_init
  * labels previously misspelled "eligibility"; corrected here.
  label var bene_dob_init "Bene date of birth, 1st year observed in Medicare elig. files"
  label var age_init "Age at end of the year, 1st year observed in Medicare elig. files"
  label var race_init "Race, 1st year observed in Medicare elig. files"
  label var sex_init "Sex, 1st year observed in Medicare elig. files"
  label var crnt_bic_init "Bene Identification Code, 1st year observed in Medicare elig. files"
  label var orec_init "Original Reason for Entitlement Code, 1st year observed in Medicare elig. files"
  label var covstart_init "covstart, 1st year observed in Medicare elig. files"
  label var init_yr "Initial eligibility file year observed"
  label var mbsf_year_init "Administrative covstart variable available in init_yr"
  label var file_vintage_init "File vintage (denominator or mbsf) of initial eligibility file bene is observed"

  * Attach value labels to the original reason for entitlement code
  label define orec 0 "OASI" 1 "DIB" 2 "ESRD" 3 "DIB and ESRD" 4 "Unknown", replace
  label values orec_init orec

  * Shrink storage types, then arrange columns: identifiers and demographics,
  * then initial geography, then coverage-start variables, then BIC/OREC;
  * any remaining variables keep their existing relative order at the end
  compress
  * assert _N == 17700645
  order bene_id init_yr file_vintage_init bene_dob_init age_init race_init sex_init ///
        bene_zip5_init fipscounty_init state_init ///
        covstart_fill covstart_init covstart_count mbsf_year_init age_mofd_covstart_fill age_year_covstart_fill ///
        crnt_bic_init orec_init

  * One row per bene (this also leaves the data sorted by bene_id), then save
  bys bene_id: assert _N == 1
  save $SSDIMed/data/proc/medicare/master_cohort, replace
  
  * Remove the temp files written while building the cohort
  if 1 {
    * Per-year denominator extracts
    forvalues yr = 1992/2008 {
      erase "$SSDIMed/data/temp/tmp_di_den_`yr'.dta"
    }

    * Per-year MBSF extracts: all nonelderly (bsf) files first, then all covstart files
    local mbsf_years 1999 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
    foreach stem in bsf covstart {
      foreach yr of local mbsf_years {
        erase "$SSDIMed/data/temp/tmp_di_`stem'_`yr'.dta"
      }
    }

    * Combined files
    foreach stem in covstart den bsf {
      erase "$SSDIMed/data/temp/tmp_di_`stem'.dta"
    }
  }



  * ------------------------------------------------------------------
  * Sample checks
  * ------------------------------------------------------------------

  * All three checks read the same saved cohort, so load it once
  use $SSDIMed/data/proc/medicare/master_cohort, clear

  * QC: cohort size, and both year variables span exactly 1993-2017
  assert _N == 17700645
  sum init_yr
  assert `r(min)' == 1993 & `r(max)' == 2017
  gen covstart_fill_yr = year(covstart_fill)
  sum covstart_fill_yr
  assert `r(min)' == 1993 & `r(max)' == 2017

  * 1. Frequency table: covstart_fill year vs. initial year in Medicare eligibility file
  tab covstart_fill_yr init_yr

  * 2. Frequency table: cohort, by covstart year or init_year
  tab covstart_fill_yr
  tab init_yr

  * 3. Frequency table: age at coverage start
  *    (this table was announced but never produced before; only the row count was checked)
  tab age_year_covstart_fill

  * Leave the in-memory data identical to the saved cohort file
  drop covstart_fill_yr
}


* 4. ENROLLMENT + ESRD (DI). Compile enrollment (incl. hmo/buyin), ESRD, entitlement reason, and ZIP5/county info each year on or after year of initial Medicare enrollment
if 1 {
  * Build one temp panel file per year: cohort members matched to that year's
  * eligibility file, restricted to years on or after the bene's initial year
  qui forvalues file_yr = 1993/2017 {
    * local file_yr 2007
    
    * Start each year from the full cohort file
    use $SSDIMed/data/proc/medicare/master_cohort, clear
    
    * Sanity check: init_yr in 1993-2006 coincides exactly with denominator vintage
    assert inrange(init_yr, 1993, 2006) == (file_vintage_init == "den")

    * Pick this year's eligibility file: denominator for 1993-2006, MBSF for 2007-2017
    if inrange(`file_yr', 1993, 2006) {
      local elig_name "denominator"
      local elig_file "$SSDIMed/data/proc/medicare/100pct/den/den`file_yr'.dta.bz2"
    }
    else if inrange(`file_yr', 2007, 2017) {
      local elig_name "MBSF"
      local elig_file "$SSDIMed/data/proc/medicare/100pct/bsf/bsfab`file_yr'.dta.bz2"
    }
    noisily di "Working on `elig_name' file year `file_yr'"

    * Merge master cohort file with enrollment info for this year
    * NOTE(review): keep(match) presumably retains only matched benes, as in built-in merge - confirm in merge_pbzip2
    local keepvars rfrnc_yr bene_dob death_dt bene_zip5 esrd_ind crnt_bic crec orec hmo_mo buyin_mo a_mo_cnt b_mo_cnt
    merge_pbzip2 1:1 bene_id using "`elig_file'", keep(match) keepusing(`keepvars') nogen noreport
    
    * Entitlement/ESRD indicators: prefer the harmonized file; fall back to the
    * den/bsf values (set aside under a _bkp suffix) where the harmonized value is missing
    foreach v in crec orec esrd_ind {
      rename `v' `v'_bkp
    }
    local keepvars crec orec esrd_ind
    merge_pbzip2 1:1 bene_id using "$SSDIMed/data/raw/medicare/100pct/harm/bsfab`file_yr'.dta.bz2", keep(match master) keepusing(`keepvars') nogen noreport
    * The backup crec/orec values are converted with string() before filling
    foreach v in crec orec {
      replace `v' = string(`v'_bkp) if missing(`v')
    }
    replace esrd_ind = esrd_ind_bkp if missing(esrd_ind)
    drop *_bkp
    
    * Keep only rfrnc_yr obs on or after init_yr
    drop if init_yr > rfrnc_yr
    
    * In the initial year, the current ZIP must equal the initial ZIP
    assert bene_zip5_init == bene_zip5 if init_yr==rfrnc_yr

    * Crosswalk current zip to current county (unmatched ZIPs are kept with missing county)
    merge m:1 bene_zip5 using "$SSDIMed/data/proc/medicare/cw/zip_cty_st_unique_1992-2017", keep(match master) keepusing(fipscounty) nogen
    
    save "$SSDIMed/data/temp/tmp_enroll_panel_`file_yr'", replace
  }
  
  * Stack the yearly panel files into one bene-year dataset
  clear
  forvalues yr = 1993/2017 {
    append using "$SSDIMed/data/temp/tmp_enroll_panel_`yr'"
  }
  sort bene_id rfrnc_yr
  
  * FFS-only flag: 1 when hmo_mo equals 0, otherwise 0 (including when hmo_mo is missing)
  gen byte ffs_only = cond(hmo_mo == 0, 1, 0)
  label variable ffs_only "Enrolled only in FFS this year (hmo_mo==0)"
  
  * State buy-in flag: 1 when buyin_mo > 0, 0 when it is 0 or less, missing when buyin_mo is missing
  gen byte buyin_any = cond(missing(buyin_mo), ., buyin_mo > 0)
  label variable buyin_any "State buy-in for Medicaid enrollee (buyin_mo > 0)"
  
  * ESRD dummy: 1 when esrd_ind is "y" in either case, otherwise 0
  gen byte esrd = inlist(esrd_ind, "y", "Y")
  drop esrd_ind
  label variable esrd "End-stage renal disease indicator (from SSA)"
  
  * QC: BIC is missing in exactly the years 2002-2005, for both the initial and the current value
  assert missing(crnt_bic_init) if inrange(init_yr, 2002, 2005)
  assert !missing(crnt_bic_init) if !inrange(init_yr, 2002, 2005)
  assert missing(crnt_bic) if inrange(rfrnc_yr, 2002, 2005)
  assert !missing(crnt_bic) if !inrange(rfrnc_yr, 2002, 2005)
  
  * Convert reasons for entitlement to numeric and attach value labels
  destring orec crec, replace
  label define rec 0 "OASI" 1 "DIB" 2 "ESRD" 3 "DIB and ESRD" 4 "Unknown", replace
  label values orec crec rec
  
  * Collapse death_dt to a single value per bene: the first nonmissing value
  * in bene_id/rfrnc_yr order
  sort bene_id rfrnc_yr
  gegen death_dt_firstnm = firstnm(death_dt), by(bene_id)

  * QC: among rows with a nonmissing death_dt, the share that differs from the
  * bene's first nonmissing value rounds to .00017
  count if !missing(death_dt)
  local n_death_reported = r(N)
  count if !missing(death_dt) & death_dt_firstnm != death_dt
  tempname share_changed
  scalar `share_changed' = round(r(N)/`n_death_reported', 0.00001)
  di `share_changed'
  assert abs(`share_changed' - .00017) < 1e-14

  drop death_dt
  label var death_dt_firstnm "Beneficiary Date of Death, first non-missing value"
  
  * First nonmissing county per bene, in bene_id/rfrnc_yr order. The string code is
  * converted to a number for firstnm(), then written back as a zero-padded
  * 5-character string (empty when the bene never has a county)
  sort bene_id rfrnc_yr
  tempvar fips_first
  gegen `fips_first' = firstnm(real(fipscounty)), by(bene_id)
  gen fipscounty_firstnm = cond(missing(`fips_first'), "", string(`fips_first', "%05.0f"))
  drop `fips_first'
  label var fipscounty_firstnm "Beneficiary county of residence, first non-missing value"
  
  * QC: every nonempty code has 5 characters, and it equals the initial county
  * whenever the initial county is nonmissing
  assert strlen(fipscounty_firstnm) == 5 if !missing(fipscounty_firstnm)
  assert fipscounty_firstnm == fipscounty_init if !missing(fipscounty_init)
  
  * QC: the panel must contain exactly as many distinct benes as the master_cohort file has rows
  quietly describe using $SSDIMed/data/proc/medicare/master_cohort
  local n_cohort = r(N)

  * Rows are unique on (bene_id, rfrnc_yr); mark each bene's first row and count the marks
  bys bene_id rfrnc_yr: assert _N==1
  tempvar first_row
  by bene_id (rfrnc_yr): gen `first_row' = (_n == 1)
  count if `first_row'
  assert r(N) == `n_cohort'
  drop `first_row'
  
  * Column placement: county variables after the initial county, current ZIP after
  * the initial ZIP, and entitlement/ESRD variables after the reference year
  order fipscounty_firstnm fipscounty, after(fipscounty_init)
  order bene_zip5, after(bene_zip5_init)
  order orec crec esrd, after(rfrnc_yr)
  
  * Shrink storage, confirm the bene-year key and the expected row count, then save
  compress
  gisid bene_id rfrnc_yr
  assert _N == 172164325
  saveold "$SSDIMed/data/proc/medicare/master_cohort_enroll.dta", replace
  
  * Remove the per-year temp panels now that the combined file is on disk
  forvalues yr = 1993/2017 {
    erase "$SSDIMed/data/temp/tmp_enroll_panel_`yr'.dta"
  }
}


* 5. SPENDING (DI). Create panel of cost/utilization for Baseline Population 1999-2017
* Note: the annual cost/utilization files begin in 1999
if 1 {
  qui forvalues mbsf_yr = 1999/2017 {
    * local mbsf_yr 2012
    noisily di "Calculating for year `mbsf_yr'"
    
    * Load benes from baseline population, in given year
    use if rfrnc_yr==`mbsf_yr' using "$SSDIMed/data/proc/medicare/master_cohort_enroll.dta", clear
  
    * Gather the following (non-overlapping) use and spending variables from cost and use segment
    * Note that beginning in 2012, primary payer and inpatient per diem payments are added
    * -------------------------------------------------------------------------------------------------------
    local usevars
    *acute_cov_days     acute inpatient covered days
    *acute_stays        acute inpatient stays
    *readmissions       hospital readmissions
    *ip_er_visits       inpatient emergency room visits
    *oip_cov_days       other inpatient covered days
    *oip_stays          other inpatient stays
    *snf_cov_days       skilled nursing facility covered days
    *snf_stays          skilled nursing facility stays
    *hos_cov_days       hospice covered days
    *hos_stays          hospice stays
    *hh_visits          home health visits
    
    local spendingvars
    *bene_id            Encrypted 723 Beneficiary ID
    *hop_mdcr_pmt       Hospital Outpatient Medicare Payments
    *hop_bene_pmt       Hospital Outpatient Beneficiary Payments
    *acute_mdcr_pmt     Acute Inpatient Medicare Payments
    *acute_bene_pmt     Acute Inpatient Beneficiary Payments
    *oip_mdcr_pmt       Other Inpatient Medicare Payments
    *oip_bene_pmt       Other Inpatient Beneficiary Payments
    *snf_mdcr_pmt       Skilled Nursing Facility Medicare Payments
    *snf_bene_pmt       Skilled Nursing Facility Beneficiary Payments
    *hos_mdcr_pmt       Hospice Medicare Payments
    *hh_mdcr_pmt        Home Health Medicare Payments
    *asc_mdcr_pmt       Ambulatory Surgery Center Medicare Payments
    *asc_bene_pmt       Ambulatory Surgery Center Beneficiary Payments
    *ptb_drug_mdcr_pmt  Part B Drug Medicare Payments
    *ptb_drug_bene_pmt  Part B Drug Beneficiary Payments
    *em_mdcr_pmt        Evaluation and Management Medicare Payments
    *em_bene_pmt        Evaluation and Management Beneficiary Payments
    *anes_mdcr_pmt      Anesthesia Medicare Payments
    *anes_bene_pmt      Anesthesia Beneficiary Payments
    *dialys_mdcr_pmt    Dialysis Medicare Payments
    *dialys_bene_pmt    Dialysis Beneficiary Payments
    *oproc_mdcr_pmt     Other Procedures Medicare Payments
    *oproc_bene_pmt     Other Procedures Beneficiary Payments
    *img_mdcr_pmt       Imaging Medicare Payments
    *img_bene_pmt       Imaging Beneficiary Payments
    *test_mdcr_pmt      Tests Medicare Payments
    *test_bene_pmt      Tests Beneficiary Payments
    *dme_mdcr_pmt       Durable Medical Equipment Medicare Payments
    *dme_bene_pmt       Durable Medical Equipment Beneficiary Payments
    *othc_mdcr_pmt      Other Part B Carrier Medicare Payments
    *othc_bene_pmt      Other Part B Carrier Beneficiary Payments
    *phys_mdcr_pmt      Part B Physician Medicare Payments
    *phys_bene_pmt      Part B Physician Beneficiary Payments
    *ptd_mdcr_pmt       Part D Medicare Payments
    *ptd_bene_pmt       Part D Beneficiary Payments
    
    local othervars
    * hop_prmry_pmt         hospital outpatient primary payer payments
    * acute_prmry_pmt       acute inpatient primary payer payments
    * acute_perdiem_pmt     acute inpatient per diem payments
    * oip_prmry_pmt         other inpatient primary payer payments
    * oip_perdiem_pmt       other inpatient per diem payments
    * snf_prmry_pmt         skilled nursing facility primary payer payments
    * hos_prmry_pmt         hospice primary payer payments
    * hh_prmry_pmt          home health primary payer payments
    * asc_prmry_pmt         ambulatory surgery center primary payer payments
    * ptb_drug_prmry_pmt    part b drug primary payer payments
    * em_prmry_pmt          evaluation and management primary payer payments
    * anes_prmry_pmt        anesthesia primary payer payments
    * dialys_prmry_pmt      dialysis primary payer payments
    * oproc_prmry_pmt       other procedures primary payer payments
    * img_prmry_pmt         imaging primary payer payments
    * test_prmry_pmt        tests primary payer payments
    * dme_prmry_pmt         durable medical equipment primary payer payments
    * othc_prmry_pmt        other part b carrier primary payer payments
    * phys_prmry_pmt        part b physician primary payer payments
    
    if      inrange(`mbsf_yr', 1999, 2012) {
      local usevars `usevars' acute_cov_days
      local usevars `usevars' acute_stays   
      local usevars `usevars' readmissions  
      local usevars `usevars' ip_er_visits  
      local usevars `usevars' oip_cov_days  
      local usevars `usevars' oip_stays     
      local usevars `usevars' snf_cov_days  
      local usevars `usevars' snf_stays     
      local usevars `usevars' hos_cov_days  
      local usevars `usevars' hos_stays     
      local usevars `usevars' hh_visits     
      
      local spendingvars `spendingvars' hop_mdcr_pmt    hop_bene_pmt 
      local spendingvars `spendingvars' acute_mdcr_pmt  acute_bene_pmt
      local spendingvars `spendingvars' oip_mdcr_pmt    oip_bene_pmt 
      local spendingvars `spendingvars' snf_mdcr_pmt    snf_bene_pmt 
      local spendingvars `spendingvars' hos_mdcr_pmt    hh_mdcr_pmt 
      local spendingvars `spendingvars' asc_mdcr_pmt    asc_bene_pmt 
      local spendingvars `spendingvars' ptb_drug_mdcr_pmt ptb_drug_bene_pmt 
      local spendingvars `spendingvars' em_mdcr_pmt     em_bene_pmt 
      local spendingvars `spendingvars' anes_mdcr_pmt   anes_bene_pmt 
      local spendingvars `spendingvars' dialys_mdcr_pmt   dialys_bene_pmt 
      local spendingvars `spendingvars' oproc_mdcr_pmt  oproc_bene_pmt 
      local spendingvars `spendingvars' img_mdcr_pmt    img_bene_pmt 
      local spendingvars `spendingvars' test_mdcr_pmt   test_bene_pmt 
      local spendingvars `spendingvars' dme_mdcr_pmt    dme_bene_pmt 
      local spendingvars `spendingvars' othc_mdcr_pmt   othc_bene_pmt 
      local spendingvars `spendingvars' phys_mdcr_pmt   phys_bene_pmt 
      
      local othervars `othervars' hop_prmry_pmt     
      local othervars `othervars' acute_prmry_pmt   
      local othervars `othervars' acute_perdiem_pmt 
      local othervars `othervars' oip_prmry_pmt     
      local othervars `othervars' oip_perdiem_pmt   
      local othervars `othervars' snf_prmry_pmt     
      local othervars `othervars' hos_prmry_pmt     
      local othervars `othervars' hh_prmry_pmt      
      local othervars `othervars' asc_prmry_pmt     
      local othervars `othervars' ptb_drug_prmry_pmt
      local othervars `othervars' em_prmry_pmt      
      local othervars `othervars' anes_prmry_pmt    
      local othervars `othervars' dialys_prmry_pmt  
      local othervars `othervars' oproc_prmry_pmt   
      local othervars `othervars' img_prmry_pmt     
      local othervars `othervars' test_prmry_pmt    
      local othervars `othervars' dme_prmry_pmt     
      local othervars `othervars' othc_prmry_pmt    
      local othervars `othervars' phys_prmry_pmt    
    }
    else if inrange(`mbsf_yr', 2013, 2017) {
      local usevars `usevars' acute_co
      local usevars `usevars' acute_st
      local usevars `usevars' readmiss
      local usevars `usevars' ip_er_vi
      local usevars `usevars' oip_cov_
      local usevars `usevars' oip_stay
      local usevars `usevars' snf_cov_
      local usevars `usevars' snf_stay
      local usevars `usevars' hos_cov_
      local usevars `usevars' hos_stay
      local usevars `usevars' hh_visit
      
      local spendingvars `spendingvars' hop_mdcr      hop_bene
      local spendingvars `spendingvars' acute_md      acute_be
      local spendingvars `spendingvars' oip_mdcr      oip_bene
      local spendingvars `spendingvars' snf_mdcr      snf_bene
      local spendingvars `spendingvars' hos_mdcr      hh_mdcr_
      local spendingvars `spendingvars' asc_mdcr      asc_bene
      local spendingvars `spendingvars' ptbrxmp       ptbrxbp
      local spendingvars `spendingvars' em_mdcr_      em_bene_
      local spendingvars `spendingvars' anes_mdc      anes_ben
      local spendingvars `spendingvars' dialys_m      dialys_b
      local spendingvars `spendingvars' oproc_md      oproc_be
      local spendingvars `spendingvars' img_mdcr      img_bene
      local spendingvars `spendingvars' test_mdc      test_ben
      local spendingvars `spendingvars' dme_mdcr      dme_bene
      local spendingvars `spendingvars' othc_mdc      othc_ben
      local spendingvars `spendingvars' phys_mdc      phys_ben
      
      local othervars `othervars' 
      local othervars `othervars' hop_prmr
      local othervars `othervars' acute_pr
      local othervars `othervars' acute_pe
      local othervars `othervars' oip_prmr
      local othervars `othervars' oip_perd
      local othervars `othervars' snf_prmr
      local othervars `othervars' hos_prmr
      local othervars `othervars' hh_prmry
      local othervars `othervars' asc_prmr
      local othervars `othervars' ptbrxpp 
      local othervars `othervars' em_prmry
      local othervars `othervars' anes_prm
      local othervars `othervars' dialys_p
      local othervars `othervars' oproc_pr
      local othervars `othervars' img_prmr
      local othervars `othervars' test_prm
      local othervars `othervars' dme_prmr
      local othervars `othervars' othc_prm
      local othervars `othervars' phys_prm
    }
    
    * Exclude Part D for now
    * local spendingvars `spendingvars' ptd_mdcr_pmt ptd_bene_pmt 
    di "`usevars'"
    di "`spendingvars'"
    di "`othervars'"
    
    * In 1999 (bsf), there are very few bene_ids (some missing) with multiple records in the cost and use segment
    * I have checked the multiple record cases that match bene_ids in the clean panelbenes file; in each case, multiple 
    * records are just duplicates, in the sense that total spending is equal across multiple records for the same bene_id.
    if `mbsf_yr'==1999 {
      gisid bene_id
      merge_pbzip2 1:m bene_id using "$SSDIMed/data/raw/medicare/100pct/bsf/bsfcu`mbsf_yr'.dta.bz2", keep(match master) keepusing(bene_id `usevars' `spendingvars')
      bys bene_id: keep if _n==1
    }
    else if inrange(`mbsf_yr', 2000, 2011) {
      merge_pbzip2 1:1 bene_id using "$SSDIMed/data/raw/medicare/100pct/bsf/bsfcu`mbsf_yr'.dta.bz2", keep(match master) keepusing(bene_id `usevars' `spendingvars')
    }
    else if inrange(`mbsf_yr', 2012, 2017) {
      merge_pbzip2 1:1 bene_id using "$SSDIMed/data/raw/medicare/100pct/bsf/bsfcu`mbsf_yr'.dta.bz2", keep(match master) keepusing(bene_id `usevars' `spendingvars' `othervars')
    }
    
    * Bene_id uniquely identifies observations
    gisid bene_id
    
    * Flag for matching to MBSF CU segment
    * Important for distinguishing between $0 spending and no data
    assert inlist(_merge, 1, 3) 
    gen byte in_mbsf_cu = (_merge ==3 )
    label var in_mbsf_cu "Indicates if bene_id matched to MBSF CU record in rfrnc_yr"
    drop _merge
    tab in_mbsf_cu
    
    * Rename shortened variable names starting in 2013
    if inrange(`mbsf_yr', 2013, 2017) {
      * utilization variables
      rename acute_co  acute_cov_days
      rename acute_st  acute_stays   
      rename readmiss  readmissions  
      rename ip_er_vi  ip_er_visits  
      rename oip_cov_  oip_cov_days  
      rename oip_stay  oip_stays     
      rename snf_cov_  snf_cov_days  
      rename snf_stay  snf_stays     
      rename hos_cov_  hos_cov_days  
      rename hos_stay  hos_stays     
      rename hh_visit  hh_visits
      local usevars
      local usevars `usevars' acute_cov_days
      local usevars `usevars' acute_stays   
      local usevars `usevars' readmissions  
      local usevars `usevars' ip_er_visits  
      local usevars `usevars' oip_cov_days  
      local usevars `usevars' oip_stays     
      local usevars `usevars' snf_cov_days  
      local usevars `usevars' snf_stays     
      local usevars `usevars' hos_cov_days  
      local usevars `usevars' hos_stays     
      local usevars `usevars' hh_visits
      
      * spending variables
      * Each row below is a pair: <current variable name> <cleaned name>.
      * The loop renames one pair at a time, top to bottom, and collects the
      * cleaned names (in that same order) in spendingvars.
      local pmt_pairs                        ///
        hop_mdcr   hop_mdcr_pmt              ///
        hop_bene   hop_bene_pmt              ///
        acute_md   acute_mdcr_pmt            ///
        acute_be   acute_bene_pmt            ///
        oip_mdcr   oip_mdcr_pmt              ///
        oip_bene   oip_bene_pmt              ///
        snf_mdcr   snf_mdcr_pmt              ///
        snf_bene   snf_bene_pmt              ///
        hos_mdcr   hos_mdcr_pmt              ///
        hh_mdcr_   hh_mdcr_pmt               ///
        asc_mdcr   asc_mdcr_pmt              ///
        asc_bene   asc_bene_pmt              ///
        ptbrxmp    ptb_drug_mdcr_pmt         ///
        ptbrxbp    ptb_drug_bene_pmt         ///
        em_mdcr_   em_mdcr_pmt               ///
        em_bene_   em_bene_pmt               ///
        anes_mdc   anes_mdcr_pmt             ///
        anes_ben   anes_bene_pmt             ///
        dialys_m   dialys_mdcr_pmt           ///
        dialys_b   dialys_bene_pmt           ///
        oproc_md   oproc_mdcr_pmt            ///
        oproc_be   oproc_bene_pmt            ///
        img_mdcr   img_mdcr_pmt              ///
        img_bene   img_bene_pmt              ///
        test_mdc   test_mdcr_pmt             ///
        test_ben   test_bene_pmt             ///
        dme_mdcr   dme_mdcr_pmt              ///
        dme_bene   dme_bene_pmt              ///
        othc_mdc   othc_mdcr_pmt             ///
        othc_ben   othc_bene_pmt             ///
        phys_mdc   phys_mdcr_pmt             ///
        phys_ben   phys_bene_pmt
      local spendingvars
      while "`pmt_pairs'" != "" {
        * peel the next <current> <cleaned> pair off the front of the list
        gettoken raw_nm   pmt_pairs : pmt_pairs
        gettoken clean_nm pmt_pairs : pmt_pairs
        rename `raw_nm' `clean_nm'
        local spendingvars `spendingvars' `clean_nm'
      }
      
      * primary payer and per diem payment variables
      * Each row below is a pair: <current variable name> <cleaned name>.
      * The loop renames one pair at a time, top to bottom, and collects the
      * cleaned names (in that same order) in othervars.
      local prmry_pairs                      ///
        hop_prmr   hop_prmry_pmt             ///
        acute_pr   acute_prmry_pmt           ///
        acute_pe   acute_perdiem_pmt         ///
        oip_prmr   oip_prmry_pmt             ///
        oip_perd   oip_perdiem_pmt           ///
        snf_prmr   snf_prmry_pmt             ///
        hos_prmr   hos_prmry_pmt             ///
        hh_prmry   hh_prmry_pmt              ///
        asc_prmr   asc_prmry_pmt             ///
        ptbrxpp    ptb_drug_prmry_pmt        ///
        em_prmry   em_prmry_pmt              ///
        anes_prm   anes_prmry_pmt            ///
        dialys_p   dialys_prmry_pmt          ///
        oproc_pr   oproc_prmry_pmt           ///
        img_prmr   img_prmry_pmt             ///
        test_prm   test_prmry_pmt            ///
        dme_prmr   dme_prmry_pmt             ///
        othc_prm   othc_prmry_pmt            ///
        phys_prm   phys_prmry_pmt
      local othervars
      while "`prmry_pairs'" != "" {
        * peel the next <current> <cleaned> pair off the front of the list
        gettoken raw_nm   prmry_pairs : prmry_pairs
        gettoken clean_nm prmry_pairs : prmry_pairs
        rename `raw_nm' `clean_nm'
        local othervars `othervars' `clean_nm'
      }
    }
    
    * Clean missing use / payment values
    *   in_mbsf_cu == 1 (individual has an MBSF CU record): missing -> 0
    *   otherwise: values are left missing
    *   Names in the lists that are not variables in the data are skipped
    foreach var in `usevars' `spendingvars' `othervars' {
      capture confirm variable `var'
      if _rc != 0 continue
      di "Cleaning missing values of `var'"
      replace `var' = 0 if (in_mbsf_cu == 1) & missing(`var')
    }
    
    * Sum over all spending categories to get total annual spending
    * The expression "0 + v1 + v2 + ..." is assembled from spendingvars and
    * evaluated in a single generate, so any missing component yields a
    * missing total.
    * NOTE(review): tot_pmt is created with the default storage type; confirm
    * that its precision is adequate for summed dollar amounts.
    local sum_pmts 0
    foreach pmt of local spendingvars {
      local sum_pmts `sum_pmts' + `pmt'
    }
    display "`sum_pmts'"
    generate tot_pmt = `sum_pmts'
    label variable tot_pmt "Total Medicare and beneficiary payments"
    
    * Part B: ptb_drug + home health care + durable medical equipment + doctors + outpatient care
    * (every component is also included in tot_pmt)
    generate ptb_pmt = ptb_drug_mdcr_pmt + ptb_drug_bene_pmt ///
                     + hh_mdcr_pmt                           ///
                     + dme_mdcr_pmt      + dme_bene_pmt      ///
                     + phys_mdcr_pmt     + phys_bene_pmt     ///
                     + hop_mdcr_pmt      + hop_bene_pmt
    label variable ptb_pmt "Medicare Part B payments"
    
    * Other payments (primary payer and per diem)
    * Sum over the variables listed in othervars. other_pmt starts out missing
    * and is only filled in when the sum expression can be evaluated.
    generate float other_pmt = .
    local sum_pmts 0
    foreach pmt of local othervars {
      local sum_pmts `sum_pmts' + `pmt'
    }
    display "`sum_pmts'"
    * capture: any error in the replace is swallowed and other_pmt stays
    * missing (presumably the case when these variables are absent from a
    * year's file -- confirm); the QC assertions that follow check the result
    capture replace other_pmt = `sum_pmts'
    label variable other_pmt "Primary payer and Per diem payments"
    
    * QC: each total is missing exactly for the rows with in_mbsf_cu == 0
    foreach total in tot_pmt ptb_pmt {
      assert missing(`total') == (in_mbsf_cu == 0)
    }
    * QC: other_pmt is never populated through 2011; from 2012 on it must
    *     follow the same missing pattern as the totals
    assert missing(other_pmt) if rfrnc_yr <= 2011
    assert missing(other_pmt) == (in_mbsf_cu == 0) if rfrnc_yr >= 2012
    
    * Keep select use/spending variables, and save this year of data
    * to a temp file
    keep bene_id rfrnc_yr in_mbsf_cu tot_pmt ptb_pmt other_pmt `usevars'
    saveold "$SSDIMed/data/temp/baseyr`base_yr'_rfrncyr`mbsf_yr'.dta", replace
  }
  
  * Append all years of use/spending for individuals
  clear
  forvalues mbsf_yr = 1999/2017 {
    append using "$SSDIMed/data/temp/baseyr`base_yr'_rfrncyr`mbsf_yr'.dta"
  }
  
  compress
  
  * QC: exactly one row per beneficiary and reference year
  bys bene_id rfrnc_yr: assert _N == 1
  
  * QC: the appended data must have the same number of rows as the enrollment
  * file restricted to 1999 onward. The expected count is held in one local
  * so the two checks cannot drift apart.
  local n_expected 160277710
  preserve
  * Only the row count is needed, so read a single variable rather than
  * loading every column of the enrollment file
  use rfrnc_yr if rfrnc_yr >= 1999 using "$SSDIMed/data/proc/medicare/master_cohort_enroll.dta", clear
  assert _N == `n_expected'
  restore
  assert _N == `n_expected'
  saveold "$SSDIMed/data/proc/medicare/master_cohort_enroll_pmt.dta", replace
  
  * Cleanup: delete the per-year temp files now that they have been combined
  forvalues mbsf_yr = 1999/2017 {
    erase "$SSDIMed/data/temp/baseyr`base_yr'_rfrncyr`mbsf_yr'.dta"
  }
}





** EOF
